import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets, model_selection, ensemble


def load_classification_data():
    '''
    Load the scikit-learn digits dataset and split it for classification.

    A quarter of the samples is held out for testing, the split is stratified
    on the class labels, and random_state=0 keeps it reproducible.

    return: the result of train_test_split, in order:
        training samples, test samples, training labels, test labels
    '''
    bunch = datasets.load_digits()
    samples, labels = bunch.data, bunch.target
    return model_selection.train_test_split(
        samples,
        labels,
        test_size=0.25,
        random_state=0,
        stratify=labels,
    )


def test_AdaBoostClassifier(*data):
    '''
    Fit an AdaBoostClassifier (learning_rate=0.1) and plot how its staged
    score on the training and test sets changes with the number of base
    estimators.

    param data: four positional values, in order:
        training samples, test samples, training labels, test labels
    return None
    '''
    train_x, test_x, train_y, test_y = data

    model = ensemble.AdaBoostClassifier(learning_rate=0.1)  # only the learning rate is set
    model.fit(train_x, train_y)

    # One x position per fitted base estimator, counted from 1.
    stage_axis = list(range(1, len(model.estimators_) + 1))
    # staged_score yields the score after each boosting stage.
    staged_train = list(model.staged_score(train_x, train_y))
    staged_test = list(model.staged_score(test_x, test_y))

    figure = plt.figure()
    axes = figure.add_subplot(111)
    axes.plot(stage_axis, staged_train, label='train score')
    axes.plot(stage_axis, staged_test, label='test score')
    axes.set_xlabel('estimators num')
    axes.set_ylabel('score')
    axes.legend(loc='best')
    axes.set_title('AdaBoostClassifier')
    plt.show()


# Build the train/test split once at module level; the later calls in this
# file reuse these four names.
X_train, X_test, y_train, y_test = load_classification_data()  # fetch the classification data
# Plot staged train/test score against the number of base estimators.
test_AdaBoostClassifier(X_train, X_test, y_train, y_test)


def test_AdaBoostClassifier_base_classifier(*data):
    '''
    Compare AdaBoostClassifier built on two different base estimators.

    Two stacked subplots are drawn, each showing the staged training and test
    score against the number of base estimators (learning_rate=0.1 in both):
    the top one uses the default base estimator, the bottom one uses
    Gaussian naive Bayes.

    :param data: four positional values, in order:
        training samples, test samples, training labels, test labels
    return: None
    '''
    from sklearn.naive_bayes import GaussianNB  # Gaussian naive Bayes classifier

    X_train, X_test, y_train, y_test = data

    default_clf = ensemble.AdaBoostClassifier(learning_rate=0.1)
    # scikit-learn 1.2 renamed ``base_estimator`` to ``estimator`` and later
    # removed the old keyword; try the new spelling first and fall back so
    # the function works on both old and new releases.
    try:
        nb_clf = ensemble.AdaBoostClassifier(learning_rate=0.1, estimator=GaussianNB())
    except TypeError:
        nb_clf = ensemble.AdaBoostClassifier(learning_rate=0.1, base_estimator=GaussianNB())

    # (classifier, subplot title) for each row of the figure, top to bottom.
    panels = (
        (default_clf, 'AdaBoostClassifier with Decision Tree'),
        (nb_clf, 'AdaBoostClassifier with Gaussian Naive Bayes'),
    )

    fig = plt.figure(figsize=(10, 20))
    for row, (clf, title) in enumerate(panels, start=1):
        ax = fig.add_subplot(len(panels), 1, row)
        clf.fit(X_train, y_train)
        # One x position per fitted base estimator, counted from 1.
        stages = list(range(1, len(clf.estimators_) + 1))
        ax.plot(stages, list(clf.staged_score(X_train, y_train)), label='Training score')
        ax.plot(stages, list(clf.staged_score(X_test, y_test)), label='Testing score')
        ax.set_xlabel('estimators num')
        ax.set_ylabel('score')
        ax.legend(loc='lower right')
        ax.set_ylim(0, 1)  # same y range on both panels so they can be compared
        ax.set_title(title)
    plt.show()


# Compare the default base estimator with Gaussian naive Bayes on the same split.
test_AdaBoostClassifier_base_classifier(X_train, X_test, y_train, y_test)


def test_AdaBoostClassifier_learn_rate(*data):
    '''
    Plot the effect of the learning rate on AdaBoostClassifier when the
    number of estimators is fixed at 500.

    One classifier is trained per learning rate taken from
    np.linspace(0.01, 1); its score on the training and test sets is
    plotted against the rate.

    param data: four positional values, in order:
        training samples, test samples, training labels, test labels
    return None
    '''
    train_x, test_x, train_y, test_y = data
    rates = np.linspace(0.01, 1)

    def score_pair(rate):
        # Train one model for this rate and return (train score, test score).
        model = ensemble.AdaBoostClassifier(learning_rate=rate, n_estimators=500)
        model.fit(train_x, train_y)
        return model.score(train_x, train_y), model.score(test_x, test_y)

    pairs = [score_pair(rate) for rate in rates]
    train_curve = [pair[0] for pair in pairs]
    test_curve = [pair[1] for pair in pairs]

    figure = plt.figure()
    axes = figure.add_subplot(1, 1, 1)
    axes.plot(rates, train_curve, label="train_score")
    axes.plot(rates, test_curve, label="test_score")
    axes.set_xlabel("learn_rate")
    axes.set_ylabel("score")
    axes.legend(loc="best")
    axes.set_title("AdaBoostClassifier")
    plt.show()


# Run the learning-rate sweep defined just above; the same function name is
# redefined further down in this file, so this call must stay before that.
test_AdaBoostClassifier_learn_rate(X_train, X_test, y_train, y_test)


def test_AdaBoostClassifier_learn_rate(*data):
    '''
    Plot the effect of the learning rate on AdaBoostClassifier when the
    number of estimators is fixed at 500 and algorithm='SAMME' is used.

    NOTE: this definition reuses the name of an earlier function in this
    file and replaces it from this point on; the only difference in the
    classifier is the explicit algorithm='SAMME' argument.

    param data: four positional values, in order:
        training samples, test samples, training labels, test labels
    return None
    '''
    x_train, x_test, y_train, y_test = data
    rate_grid = np.linspace(0.01, 1)

    # Legend label -> scores collected for each learning rate, in plot order.
    curves = {"train_score": [], "test_score": []}
    for lr in rate_grid:
        booster = ensemble.AdaBoostClassifier(
            learning_rate=lr,
            n_estimators=500,
            algorithm='SAMME',
        )
        booster.fit(x_train, y_train)
        curves["train_score"].append(booster.score(x_train, y_train))
        curves["test_score"].append(booster.score(x_test, y_test))

    canvas = plt.figure()
    panel = canvas.add_subplot(1, 1, 1)
    for legend_name, values in curves.items():
        panel.plot(rate_grid, values, label=legend_name)
    panel.set_xlabel("learn_rate")
    panel.set_ylabel("score")
    panel.legend(loc="best")
    panel.set_title("AdaBoostClassifier")
    plt.show()


# Run the second definition of test_AdaBoostClassifier_learn_rate, i.e. the
# learning-rate sweep that passes algorithm='SAMME'.
test_AdaBoostClassifier_learn_rate(X_train, X_test, y_train, y_test)